%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer
import re
# Tutorial about Python regular expressions: https://pymotw.com/2/re/
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
from tqdm import tqdm
import os
from chart_studio.plotly import plotly
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
from collections import Counter
# Load a 30k-row sample of the DonorsChoose projects and per-project resources.
project_data = pd.read_csv("train_data.csv", nrows = 30000)
resource_data = pd.read_csv("resources.csv", nrows = 30000)
print("Number of data points in train data", project_data.shape)
print('-'*50)
print("The attributes of data :", project_data.columns.values)
# Let's check for any "null" or "missing" values
project_data.info()
print(project_data['teacher_prefix'].isna().sum())
# "teacher_prefix" contains a couple of missing values; fill them with the mode.
# FIX: compute the mode instead of hard-coding 'Mrs.' so the imputation stays
# correct if the sample changes (the mode of this sample is 'Mrs.').
project_data['teacher_prefix'] = project_data['teacher_prefix'].fillna(project_data['teacher_prefix'].mode()[0])
# Aggregate resource rows per project and join total price/quantity onto the projects.
price_data = resource_data.groupby('id').agg({'price':'sum', 'quantity':'sum'}).reset_index()
project_data = pd.merge(project_data, price_data, on='id', how='left')
# Keep only modelling columns; "project_resource_summary" is optional, ids/timestamps unused.
project_data.drop(['id','teacher_id','project_submitted_datetime','project_resource_summary'],axis=1, inplace=True)
print(project_data.columns)
# Class balance: the ratio of approved (1) to not approved (0) is roughly 5.5,
# i.e. the data is highly imbalanced.
print(project_data['project_is_approved'].value_counts())
number_of_approved = (project_data['project_is_approved'] == 1).sum()
number_of_not_approved = (project_data['project_is_approved'] == 0).sum()
print("Ratio of Project approved to Not approved is:", number_of_approved/number_of_not_approved)
# Merge the four essay columns into one text field.
# FIX: essays 3 and 4 are missing for most rows; the original str-cast turned
# NaN into the literal token "nan" and glued essays together with no separator.
project_data["essay"] = (project_data["project_essay_1"].fillna('').map(str) + ' ' +
                         project_data["project_essay_2"].fillna('').map(str) + ' ' +
                         project_data["project_essay_3"].fillna('').map(str) + ' ' +
                         project_data["project_essay_4"].fillna('').map(str))
project_data.head(2)
# Drop the individual essay columns now that "essay" captures the text.
project_data.drop(['project_essay_1','project_essay_2','project_essay_3','project_essay_4'],axis=1, inplace=True)
y = project_data['project_is_approved'].values
X = project_data.drop(['project_is_approved'], axis=1)
X.head(1)
# Stratified split keeps the same class imbalance in both train and test.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)
def cleaning_text_data(list_text_feature, df, old_col_name, new_col_name):
    """Normalise a comma-separated categorical column in place.

    Each value such as "Math & Science, Warmth, Care & Hunger" becomes a
    space-separated token string like "Math_Science Warmth Care_Hunger":
    the standalone word 'The' is dropped, internal spaces are squashed and
    '&' becomes '_'. The cleaned strings are written to df[new_col_name],
    old_col_name is dropped, and a word->frequency dict sorted by ascending
    count is returned.
    """
    cleaned_rows = []
    for raw in list_text_feature:
        tokens = []
        for part in raw.split(','):
            # Only triggered when 'The' appears as a standalone word, but the
            # replace then removes every 'The' occurrence inside the part.
            if 'The' in part.split():
                part = part.replace('The', '')
            # squash all spaces: "Math & Science" -> "Math&Science"
            tokens.append(part.replace(' ', '').strip())
        cleaned_rows.append(' '.join(tokens).replace('&', '_').strip())
    df[new_col_name] = cleaned_rows
    df.drop([old_col_name], axis=1, inplace=True)
    # Frequency of every cleaned token across the column, sorted ascending.
    token_counts = Counter()
    for row in df[new_col_name].values:
        token_counts.update(row.split())
    return dict(sorted(token_counts.items(), key=lambda kv: kv[1]))
def clean_project_grade(list_text_feature, df, old_col_name, new_col_name):
    """Normalise the grade-category column in place.

    A value like "Grades PreK-2" becomes "Grades_PreK_2" (first word joined
    with the '-'-split pieces of the last word). Cleaned strings are written
    to df[new_col_name], old_col_name is dropped, and a word->frequency dict
    sorted by ascending count is returned.
    """
    cleaned = []
    for raw in list_text_feature:
        words = raw.split(' ')
        # e.g. ['Grades', 'PreK-2'] -> ['Grades', 'PreK', '2']
        pieces = [words[0]] + words[-1].split('-')
        cleaned.append('_'.join(pieces).strip())
    df[new_col_name] = cleaned
    df.drop([old_col_name], axis=1, inplace=True)
    # Frequency of each cleaned grade label, sorted ascending.
    label_counts = Counter()
    for row in df[new_col_name].values:
        label_counts.update(row.split())
    return dict(sorted(label_counts.items(), key=lambda kv: kv[1]))
# Clean the categorical text columns. These transforms are stateless (nothing is
# fitted), so applying them to train and test independently is safe; each call
# also returns a token->frequency dict sorted by ascending count.
x_train_sorted_category_dict = cleaning_text_data(X_train['project_subject_categories'],X_train,'project_subject_categories','clean_categories')
x_test_sorted_category_dict = cleaning_text_data(X_test['project_subject_categories'],X_test,'project_subject_categories','clean_categories')
x_train_sorted_subcategories = cleaning_text_data(X_train['project_subject_subcategories'],X_train,'project_subject_subcategories','clean_subcategories')
x_test_sorted_subcategories = cleaning_text_data(X_test['project_subject_subcategories'],X_test,'project_subject_subcategories','clean_subcategories')
x_train_sorted_grade = clean_project_grade(X_train['project_grade_category'],X_train,'project_grade_category','clean_grade')
x_test_sorted_grade = clean_project_grade(X_test['project_grade_category'],X_test,'project_grade_category','clean_grade')
# https://stackoverflow.com/a/47091490/4084039
import re
def decontracted(phrase):
    """Expand common English contractions in *phrase* and return the result.

    E.g. "won't" -> "will not", "they're" -> "they are". Note that the
    generic "'s" rule also rewrites possessives ("John's" -> "John is").
    """
    # Order matters: the specific whole-word rules must run before the
    # generic suffix rules, exactly as in the original regex chain.
    rules = [
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    ]
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
# NOTE(review): this assignment shadows the `stopwords` name imported from
# nltk.corpus at the top of the file; all later lookups use this custom
# all-lowercase list, so membership tests are case-sensitive unless the
# caller lowercases the word first.
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
"you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
"hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
"mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
'won', "won't", 'wouldn', "wouldn't"]
# Combining all the above stundents
from tqdm import tqdm
def process_text(df, col_name):
    """Clean df[col_name] for modelling and return a list of strings.

    Per row: expand contractions, blank out literal escape sequences,
    strip everything except letters/digits, drop stopwords
    (case-insensitively) and lower-case the result. The returned list is
    aligned with df's rows.
    """
    cleaned = []
    for raw in tqdm(df[col_name].values):  # tqdm renders a progress bar
        text = decontracted(raw)
        for escaped in ('\\r', '\\"', '\\n'):
            text = text.replace(escaped, ' ')
        text = re.sub('[^A-Za-z0-9]+', ' ', text)
        # https://gist.github.com/sebleier/554280
        kept = [w for w in text.split() if w.lower() not in stopwords]
        cleaned.append(' '.join(kept).lower().strip())
    return cleaned
x_train_essay_preprocessed = process_text(X_train,'essay')
x_test_essay_preprocessed = process_text(X_test,'essay')
x_train_title_preprocessed = process_text(X_train,'project_title')
x_test_title_preprocessed = process_text(X_test,'project_title')
# We have following categorical features
# 1) school_state, 2) project_categories(clean_categories), 3) project_subcategories(clean_subcategories),
# 4) project_grade_categories(clean_grade), 5) teacher_prefix
def response_coded(x_train, y_train, x_test, y_test, col_name):
    """Response-code the categorical column `col_name`.

    For every category value, P(label=1) and P(label=0) are estimated from the
    labels of the split the value appears in; values present in test but not
    in train fall back to a neutral 0.5/0.5 on both splits. Keys are
    lower-cased, with later writes overwriting earlier ones — matching the
    original "last matching entry wins" lookup.

    Returns four lists aligned with the rows of x_train/x_test:
    (train P(1), train P(0), test P(1), test P(0)), or the string
    "rows mismatch" when features and labels differ in length (kept for
    backward compatibility with existing callers).

    NOTE(review): test probabilities are computed from the *test* labels,
    which leaks the target into the test features — confirm this is intended.
    """
    if (len(x_train[col_name]) != len(y_train)) or (len(x_test[col_name]) != len(y_test)):
        return "rows mismatch"

    def _prob_map(values, labels):
        # Single pass to count labels per exact (case-sensitive) value, then
        # one pass over the unique values. The original rescanned the whole
        # column once per unique value: O(rows * uniques) -> O(rows + uniques).
        ones, zeros = Counter(), Counter()
        for attr, label in zip(values, labels):
            if label == 0:
                zeros[attr] += 1
            else:
                ones[attr] += 1
        probs = {}
        for attr in list(set(values)):
            total = ones[attr] + zeros[attr]
            probs[attr.lower()] = (ones[attr] / total, zeros[attr] / total)
        return probs

    train_probs = _prob_map(x_train[col_name], y_train)
    test_probs = _prob_map(x_test[col_name], y_test)

    # Category values present in test but absent from train get a neutral 0.5
    # on both splits (overriding the test-side estimate, as before).
    for attr in set(x_test[col_name]) - set(x_train[col_name]):
        train_probs[attr.lower()] = (0.5, 0.5)
        test_probs[attr.lower()] = (0.5, 0.5)

    # Map every row to its category's probabilities; unmatched rows keep the
    # original default of 0.
    train_list_1, train_list_0 = [], []
    for value in x_train[col_name]:
        p1, p0 = train_probs.get(value.lower(), (0, 0))
        train_list_1.append(p1)
        train_list_0.append(p0)
    test_list_1, test_list_0 = [], []
    for value in x_test[col_name]:
        p1, p0 = test_probs.get(value.lower(), (0, 0))
        test_list_1.append(p1)
        test_list_0.append(p0)
    return train_list_1, train_list_0, test_list_1, test_list_0
# Response-code each categorical column and attach the probability features.
# The bare `[:10]` expressions only display output in a notebook; they are
# no-ops when this file runs as a script.
x_train_school_state_1,x_train_school_state_0, x_test_school_state_1,x_test_school_state_0 = response_coded(X_train,y_train,X_test,y_test,'school_state')
X_train['train_school_state_1'] = x_train_school_state_1
X_train['train_school_state_0'] = x_train_school_state_0
X_test['test_school_state_1'] = x_test_school_state_1
X_test['test_school_state_0'] = x_test_school_state_0
X_train['train_school_state_1'][:10]
X_train['train_school_state_0'][:10]
X_test['test_school_state_1'][:10]
X_test['test_school_state_0'][:10]
# project subject categories
x_train_pro_cat_1,x_train_pro_cat_0, x_test_pro_cat_1,x_test_pro_cat_0 = response_coded(X_train,y_train,X_test,y_test,'clean_categories')
X_train['train_pro_cat_1'] = x_train_pro_cat_1
X_train['train_pro_cat_0'] = x_train_pro_cat_0
X_test['test_pro_cat_1'] = x_test_pro_cat_1
X_test['test_pro_cat_0'] = x_test_pro_cat_0
X_train['train_pro_cat_1'][:10]
X_train['train_pro_cat_0'][:10]
X_test['test_pro_cat_1'][:10]
X_test['test_pro_cat_0'][:10]
# project subject subcategories
x_train_pro_subcat_1,x_train_pro_subcat_0, x_test_pro_subcat_1,x_test_pro_subcat_0 = response_coded(X_train,y_train,X_test,y_test,'clean_subcategories')
X_train['train_pro_subcat_1'] = x_train_pro_subcat_1
X_train['train_pro_subcat_0'] = x_train_pro_subcat_0
X_test['test_pro_subcat_1'] = x_test_pro_subcat_1
X_test['test_pro_subcat_0'] = x_test_pro_subcat_0
X_train['train_pro_subcat_1'][:10]
X_train['train_pro_subcat_0'][:10]
X_test['test_pro_subcat_1'][:10]
X_test['test_pro_subcat_0'][:10]
# project grade category
x_train_grade_1,x_train_grade_0, x_test_grade_1,x_test_grade_0 = response_coded(X_train,y_train,X_test,y_test,'clean_grade')
X_train['train_grade_1'] = x_train_grade_1
X_train['train_grade_0'] = x_train_grade_0
X_test['test_grade_1'] = x_test_grade_1
X_test['test_grade_0'] = x_test_grade_0
X_train['train_grade_1'][:10]
X_train['train_grade_0'][:10]
X_test['test_grade_1'][:10]
X_test['test_grade_0'][:10]
# teacher prefix
x_train_prefix_1,x_train_prefix_0, x_test_prefix_1, x_test_prefix_0 = response_coded(X_train,y_train,X_test,y_test,'teacher_prefix')
X_train['train_prefix_1'] = x_train_prefix_1
X_train['train_prefix_0'] = x_train_prefix_0
X_test['test_prefix_1'] = x_test_prefix_1
X_test['test_prefix_0'] = x_test_prefix_0
X_train['train_prefix_1'][:10]
X_train['train_prefix_0'][:10]
X_test['test_prefix_1'][:10]
X_test['test_prefix_0'][:10]
def bow_vectorizer(X_train, col_name, df):
    """Fit a bag-of-words vocabulary on X_train[col_name] and transform
    df[col_name] with it.

    Returns (sparse count matrix, list of vocabulary feature names).
    NOTE(review): `get_feature_names` was removed in newer scikit-learn in
    favour of `get_feature_names_out` — confirm the pinned sklearn version.
    """
    cv = CountVectorizer()
    cv.fit(X_train[col_name].values)
    return cv.transform(df[col_name].values), cv.get_feature_names()
# BoW on essays: the vocabulary is fitted on X_train only and reused to
# transform X_test, so no test information leaks into the features.
x_train_essay_bow, x_train_essay_feat = bow_vectorizer(X_train,'essay',X_train)
x_test_essay_bow, x_test_essay_feat = bow_vectorizer(X_train,'essay',X_test)
print(x_train_essay_bow.shape)
print(x_test_essay_bow.shape)
def bow_vectorizer_title(X_train, col_name, df):
    """Fit a bag-of-words vocabulary on X_train[col_name] and transform
    df[col_name] with it; returns (sparse count matrix, feature names).

    This was a line-for-line duplicate of bow_vectorizer, so it now simply
    delegates to it (kept as a separate name for backward compatibility).
    """
    return bow_vectorizer(X_train, col_name, df)
# BoW on project titles, fitted on X_train only (same no-leak pattern as essays).
x_train_title_bow, x_train_title_feat = bow_vectorizer_title(X_train,'project_title',X_train)
x_test_title_bow, x_test_title_feat = bow_vectorizer_title(X_train,'project_title',X_test)
print(x_train_title_bow.shape)
print(x_test_title_bow.shape)
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_vectorizer(X_train, col_name, df):
    """Fit a TF-IDF vocabulary on X_train[col_name] and transform
    df[col_name] with it.

    Returns (sparse tf-idf matrix, list of vocabulary feature names).
    """
    tfidf = TfidfVectorizer()
    tfidf.fit(X_train[col_name].values)
    return tfidf.transform(df[col_name].values), tfidf.get_feature_names()
# Lets vectorize essay
# TF-IDF fitted on X_train only; X_test is transformed with the train idf stats.
x_train_essay_tfidf, x_train_essay_tfidf_feat = tfidf_vectorizer(X_train,'essay',X_train)
x_test_essay_tfidf, x_test_essay_tfidf_feat = tfidf_vectorizer(X_train,'essay',X_test)
print(x_train_essay_tfidf.shape)
print(x_test_essay_tfidf.shape)
from sklearn.feature_extraction.text import TfidfVectorizer
def tfidf_vectorizer_title(X_train, col_name, df):
    """Fit a TF-IDF vocabulary on X_train[col_name] and transform
    df[col_name] with it; returns (sparse tf-idf matrix, feature names).

    This was a line-for-line duplicate of tfidf_vectorizer, so it now simply
    delegates to it (kept as a separate name for backward compatibility).
    """
    return tfidf_vectorizer(X_train, col_name, df)
# Lets vectorize essay
# (actually the project titles) — TF-IDF fitted on X_train only, as above.
x_train_title_tfidf, x_train_title_tfidf_feat = tfidf_vectorizer_title(X_train,'project_title',X_train)
x_test_title_tfidf, x_test_title_tfidf_feat = tfidf_vectorizer_title(X_train,'project_title',X_test)
print(x_train_title_tfidf.shape)
print(x_test_title_tfidf.shape)
# The triple-quoted block below is intentionally disabled one-off code that was
# used once to filter the raw GloVe file down to this corpus's vocabulary and
# pickle the result as 'glove_vectors'. It is kept for provenance only.
'''
# Reading glove vectors in python: https://stackoverflow.com/a/38230349/4084039
def loadGloveModel(gloveFile):
print ("Loading Glove Model")
f = open(gloveFile,'r', encoding="utf8")
model = {}
for line in tqdm(f):
splitLine = line.split()
word = splitLine[0]
embedding = np.array([float(val) for val in splitLine[1:]])
model[word] = embedding
print ("Done.",len(model)," words loaded!")
return model
model = loadGloveModel('glove.42B.300d.txt')
# ============================
Output:
Loading Glove Model
1917495it [06:32, 4879.69it/s]
Done. 1917495 words loaded!
# ============================
words = []
for i in preproced_texts:
words.extend(i.split(' '))
for i in preproced_titles:
words.extend(i.split(' '))
print("all the words in the coupus", len(words))
words = set(words)
print("the unique words in the coupus", len(words))
inter_words = set(model.keys()).intersection(words)
print("The number of words that are present in both glove vectors and our coupus", \
len(inter_words),"(",np.round(len(inter_words)/len(words)*100,3),"%)")
words_courpus = {}
words_glove = set(model.keys())
for i in words:
if i in words_glove:
words_courpus[i] = model[i]
print("word 2 vec length", len(words_courpus))
# stronging variables into pickle files python: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/
import pickle
with open('glove_vectors', 'wb') as f:
pickle.dump(words_courpus, f)
'''
# stronging variables into pickle files python: http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/
# make sure you have the glove_vectors file
# Load the pre-filtered word -> 300-d GloVe vector dict produced by the block
# above. NOTE(review): pickle.load executes arbitrary code — only load a
# 'glove_vectors' file you created yourself.
with open('glove_vectors', 'rb') as f:
    model = pickle.load(f)
glove_words = set(model.keys())
# Combining all the above stundents
from tqdm import tqdm
def preprocess_essay(df, col_name):
    """Clean df[col_name] exactly like process_text: expand contractions,
    strip escape sequences and punctuation, drop stopwords and lower-case.

    Returns a list of cleaned strings aligned with df's rows.
    """
    preprocessed_essays = []
    # tqdm is for printing the status bar
    for sentance in tqdm(df[col_name].values):
        sent = decontracted(sentance)
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\"', ' ')
        sent = sent.replace('\\n', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # https://gist.github.com/sebleier/554280
        # FIX: compare case-insensitively, matching process_text above. The
        # original used `e not in stopwords`, which kept capitalised stopwords
        # ("The", "And", ...) because the stopword list is all lowercase.
        sent = ' '.join(e for e in sent.split() if e.lower() not in stopwords)
        preprocessed_essays.append(sent.lower().strip())
    return preprocessed_essays
# average Word2Vec
# compute average word2vec for each review.
def compute_avg_W2V(preprocessed_feature):
    """Average the 300-d GloVe vectors of each sentence's known words.

    Words missing from the GloVe vocabulary are skipped; a sentence with no
    known words maps to the zero vector. Returns a list of numpy arrays
    aligned with the input list.
    """
    sentence_vectors = []
    for sentence in tqdm(preprocessed_feature):
        total = np.zeros(300)  # GloVe vectors here are 300-dimensional
        n_known = 0
        for word in sentence.split():
            if word in glove_words:
                total += model[word]
                n_known += 1
        sentence_vectors.append(total / n_known if n_known else total)
    return sentence_vectors
# Re-clean essays/titles and build the average-word2vec features.
# NOTE(review): these columns were already cleaned by process_text above;
# preprocess_essay differs only in its stopword case handling — confirm which
# variant is intended before relying on both.
x_train_preprocessed_essay = preprocess_essay(X_train,'essay')
x_test_preprocessed_essay = preprocess_essay(X_test,'essay')
x_train_preprocessed_title = preprocess_essay(X_train,'project_title')
x_test_preprocessed_title = preprocess_essay(X_test,'project_title')
x_train_avg_w2v_essay = compute_avg_W2V(x_train_preprocessed_essay)
x_test_avg_w2v_essay = compute_avg_W2V(x_test_preprocessed_essay)
x_train_avg_w2v_title = compute_avg_W2V(x_train_preprocessed_title)
x_test_avg_w2v_title = compute_avg_W2V(x_test_preprocessed_title)
# S = ["abc def pqr", "def def def abc", "pqr pqr def"]
def get_tfidf_dict(preprocessed_feature):
    """Fit a TfidfVectorizer on the given list of strings.

    Returns (word -> idf weight dict, set of vocabulary words).
    """
    tfidf = TfidfVectorizer()
    tfidf.fit(preprocessed_feature)
    names = tfidf.get_feature_names()
    # map each vocabulary word to its learned idf weight
    return dict(zip(names, list(tfidf.idf_))), set(names)
# average Word2Vec
# compute average word2vec for each review.
def compute_tfidf_w2v_vectors(preprocessed_feature):
    """TF-IDF-weighted average GloVe vector per sentence.

    Each known word's 300-d vector is weighted by tf*idf (tf computed within
    the sentence, idf from a TfidfVectorizer fitted on the input list) and the
    weighted sum is normalised by the total weight. Sentences with no known
    words map to the zero vector. Returns a list of numpy arrays aligned with
    the input.
    """
    tfidf_w2v_vectors = []
    dictionary, tfidf_words = get_tfidf_dict(preprocessed_feature)
    for sentence in tqdm(preprocessed_feature):
        words = sentence.split()
        # FIX: the original used sentence.count(word), which counts substring
        # matches (e.g. 'art' inside 'heart'), inflating tf; count whole words.
        word_counts = Counter(words)
        n_words = len(words)
        vector = np.zeros(300)
        tf_idf_weight = 0
        for word in words:
            if (word in glove_words) and (word in tfidf_words):
                tf_idf = dictionary[word] * (word_counts[word] / n_words)
                vector += model[word] * tf_idf  # tf-idf weighted word vector
                tf_idf_weight += tf_idf
        if tf_idf_weight != 0:
            vector /= tf_idf_weight
        tfidf_w2v_vectors.append(vector)
    return tfidf_w2v_vectors
# TF-IDF weighted word2vec features for essays and titles.
# NOTE(review): compute_tfidf_w2v_vectors fits its TfidfVectorizer on the list
# it receives, so train and test use different idf statistics — confirm this
# is intended (fitting on train only would avoid test leakage).
x_train_weighted_w2v_essay = compute_tfidf_w2v_vectors(x_train_essay_preprocessed)
x_test_weighted_w2v_essay= compute_tfidf_w2v_vectors(x_test_essay_preprocessed)
x_train_weighted_w2v_title = compute_tfidf_w2v_vectors(x_train_title_preprocessed)
x_test_weighted_w2v_title= compute_tfidf_w2v_vectors(x_test_title_preprocessed)
# We have 2 numerical features left: "price" and "teacher_number_of_previously_posted_projects".
# Let's check for "missing"/"NaN" values in those numerical features and use mean
# replacement for "price" and mode replacement for
# "teacher_number_of_previously_posted_projects".
# (This line was a markdown cell pasted into code; as bare prose it was a SyntaxError.)
# Count NaNs per numeric column on each split before imputation.
print("Total number of \"Missing\" Values present in X_train price:",X_train['price'].isna().sum())
print("Total number of \"Missing\" Values present in X_test price:",X_test['price'].isna().sum())
print("Total number of \"Missing\" Values present in X_train previous teacher number:",X_train['teacher_number_of_previously_posted_projects'].isna().sum())
print("Total number of \"Missing\" Values present in X_test previous teacher number:",X_test['teacher_number_of_previously_posted_projects'].isna().sum())
print("Total number of \"Missing\" Values present in X_train quantity:",X_train['quantity'].isna().sum())
print("Total number of \"Missing\" Values present in X_test quantity:",X_test['quantity'].isna().sum())
# "teacher_number_of_previously_posted_projects" does not have any "missing" values.
# (This line was a markdown cell pasted into code; as bare prose it was a SyntaxError.)
# Impute missing price/quantity values with the column mean.
# FIX: the original hard-coded rounded means (274.0266, 288.2436, ...) copied
# from notebook output; compute them so the imputation stays correct when the
# sample changes.
# NOTE(review): imputing X_test with its own statistics leaks test information
# — consider reusing the X_train means for both splits.
print(X_train['price'].mean())
X_train['price'] = X_train['price'].fillna(X_train['price'].mean())
print(X_test['price'].mean())
X_test['price'] = X_test['price'].fillna(X_test['price'].mean())
print(X_train['quantity'].mean())
print(X_test['quantity'].mean())
X_train['quantity'] = X_train['quantity'].fillna(X_train['quantity'].mean())
X_test['quantity'] = X_test['quantity'].fillna(X_test['quantity'].mean())
# check this one: https://www.youtube.com/watch?v=0HOqOcln3Z4&t=530s
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
from sklearn.preprocessing import StandardScaler
def scaler_function(df, col_name, scaler=None):
    """Standardize df[col_name] to zero mean / unit variance.

    Parameters
    ----------
    df, col_name : the frame and numeric column to scale.
    scaler : optional already-fitted StandardScaler. Pass one fitted on the
        train split so test data reuses train statistics; the default (None)
        keeps the original behaviour of fitting on df itself.

    Returns the scaled values as an (n, 1) array.

    NOTE(review): the existing call sites fit separately on train and test,
    which leaks test statistics into the test features — prefer fitting once
    on X_train and passing that scaler here for X_test.
    """
    values = df[col_name].values.reshape(-1, 1)
    if scaler is None:
        scaler = StandardScaler()
        scaler.fit(values)  # learn mean and standard deviation of this data
    print(f"Mean : {scaler.mean_[0]}, Standard deviation : {np.sqrt(scaler.var_[0])}")
    return scaler.transform(values)
# Standardize the numeric columns.
# NOTE(review): each call fits its own scaler, so X_test is scaled with the
# test mean/std rather than the train statistics — confirm this is intended.
x_train_teacher_number = scaler_function(X_train,'teacher_number_of_previously_posted_projects')
x_test_teacher_number = scaler_function(X_test,'teacher_number_of_previously_posted_projects')
x_train_price = scaler_function(X_train,'price')
x_test_price = scaler_function(X_test,'price')
x_train_quantity = scaler_function(X_train,'quantity')
x_test_quantity = scaler_function(X_test,'quantity')
# train dataset
print("After Vectorization and One hot encoding train dataset shape becomes:")
# FIX: the original printed train_grade_1 / test_grade_1 twice and never
# printed the corresponding *_grade_0 columns (copy-paste slip).
for col in ('train_pro_cat_1', 'train_pro_cat_0', 'train_pro_subcat_1',
            'train_pro_subcat_0', 'train_school_state_1', 'train_school_state_0',
            'train_prefix_1', 'train_prefix_0', 'train_grade_1', 'train_grade_0'):
    print(np.asarray(X_train[col]).shape)
print(x_train_essay_bow.shape)
print(x_train_title_bow.shape)
print(x_train_essay_tfidf.shape)
print(x_train_title_tfidf.shape)
print(np.asarray(x_train_avg_w2v_essay).shape)
print(np.asarray(x_train_avg_w2v_title).shape)
print(np.asarray(x_train_weighted_w2v_essay).shape)
print(np.asarray(x_train_weighted_w2v_title).shape)
print(x_train_teacher_number.shape)
print(x_train_price.shape)
print(x_train_quantity.shape)
print("="*50)
# test dataset
print("After Vectorization and One hot encoding test dataset shape becomes:")
for col in ('test_pro_cat_1', 'test_pro_cat_0', 'test_pro_subcat_1',
            'test_pro_subcat_0', 'test_school_state_1', 'test_school_state_0',
            'test_prefix_1', 'test_prefix_0', 'test_grade_1', 'test_grade_0'):
    print(np.asarray(X_test[col]).shape)
print(x_test_essay_bow.shape)
print(x_test_title_bow.shape)
print(x_test_essay_tfidf.shape)
print(x_test_title_tfidf.shape)
print(np.asarray(x_test_avg_w2v_essay).shape)
print(np.asarray(x_test_avg_w2v_title).shape)
print(np.asarray(x_test_weighted_w2v_essay).shape)
print(np.asarray(x_test_weighted_w2v_title).shape)
print(x_test_teacher_number.shape)
print(x_test_price.shape)
print(x_test_quantity.shape)
print("="*50)
def enable_plotly_in_cell():
    """Inject the require.js shim and re-init plotly offline mode so plots
    render inside a notebook (Colab) output cell.

    NOTE(review): `display` is a notebook-injected builtin; this function only
    works when run inside IPython/Jupyter.
    """
    import IPython
    from plotly.offline import init_notebook_mode
    shim = '''<script src="/static/components/requirejs/require.js"></script>'''
    display(IPython.core.display.HTML(shim))
    init_notebook_mode(connected=False)
%matplotlib inline
# Re-import and re-init plotly offline mode (already done at the top of the
# file; harmless repetition from the notebook export). The %matplotlib magic
# only works inside IPython/Jupyter.
import plotly.offline as offline
import plotly.graph_objs as go
offline.init_notebook_mode()
def plot_3d_plot(x_tr, y_tr, x_te, y_te, depth_list, split_list, algo):
    """Fit one model per zipped (max_depth, min_samples_split) pair and draw a
    3-D scatter of train vs test AUC with plotly.

    algo selects the model: 'RF_' -> RandomForestClassifier (balanced),
    'GB_' -> GradientBoostingClassifier. NOTE: zip truncates to the shorter of
    depth_list/split_list, so unequal lengths silently drop combinations.
    """
    # Local import: GradientBoostingClassifier was never imported at module
    # level, so the 'GB_' branch raised NameError in the original.
    from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
    auc_tr = []
    auc_te = []
    for depth, split in zip(depth_list, split_list):
        if algo == 'RF_':
            clf_ = RandomForestClassifier(max_depth=depth, min_samples_split=split,
                                          class_weight="balanced")
        elif algo == "GB_":
            clf_ = GradientBoostingClassifier(max_depth=depth, min_samples_split=split)
        else:
            # FIX: the original left clf_ unbound here and crashed later with
            # UnboundLocalError; fail fast with a clear message instead.
            raise ValueError(f"unknown algo: {algo!r} (expected 'RF_' or 'GB_')")
        clf_.fit(x_tr, y_tr)
        # positive-class probabilities for the ROC curves
        y_train_prob = clf_.predict_proba(x_tr)[:, 1]
        y_test_prob = clf_.predict_proba(x_te)[:, 1]
        train_fpr, train_tpr, _ = roc_curve(y_tr, y_train_prob)
        test_fpr, test_tpr, _ = roc_curve(y_te, y_test_prob)
        auc_tr.append(auc(train_fpr, train_tpr))
        auc_te.append(auc(test_fpr, test_tpr))
    # https://plot.ly/python/3d-axes/
    trace1 = go.Scatter3d(x=split_list, y=depth_list, z=auc_tr, name='train')
    trace2 = go.Scatter3d(x=split_list, y=depth_list, z=auc_te, name='cv')
    enable_plotly_in_cell()
    layout = go.Layout(scene = dict(
        xaxis = dict(title='min_samples_split'),
        yaxis = dict(title='max_depth'),
        zaxis = dict(title='AUC'),))
    fig = go.Figure(data=[trace1, trace2], layout=layout)
    offline.iplot(fig, filename='3d-scatter-colorscale')
# merge two sparse matrices: https://stackoverflow.com/a/19710648/4084039
from scipy.sparse import hstack
# with the same hstack function we are concatinating a sparse matrix and a dense matirx :)
# Set 1 = response-coded categoricals + scaled numerics + BoW title/essay.
X_train_set_1 = hstack((X_train['train_pro_cat_1'].values.reshape(-1,1), X_train['train_pro_cat_0'].values.reshape(-1,1), X_train['train_pro_subcat_1'].values.reshape(-1,1), X_train['train_pro_subcat_0'].values.reshape(-1,1), X_train['train_school_state_1'].values.reshape(-1,1), X_train['train_school_state_0'].values.reshape(-1,1), X_train['train_grade_1'].values.reshape(-1,1),\
X_train['train_grade_0'].values.reshape(-1,1),X_train['train_prefix_1'].values.reshape(-1,1),X_train['train_prefix_0'].values.reshape(-1,1),x_train_teacher_number,x_train_price,x_train_quantity,x_train_title_bow,x_train_essay_bow)).tocsr()
X_test_set_1 = hstack((X_test['test_pro_cat_1'].values.reshape(-1,1),X_test['test_pro_cat_0'].values.reshape(-1,1),X_test['test_pro_subcat_1'].values.reshape(-1,1),X_test['test_pro_subcat_0'].values.reshape(-1,1),X_test['test_school_state_1'].values.reshape(-1,1),X_test['test_school_state_0'].values.reshape(-1,1),X_test['test_grade_1'].values.reshape(-1,1),\
X_test['test_grade_0'].values.reshape(-1,1),X_test['test_prefix_1'].values.reshape(-1,1),X_test['test_prefix_0'].values.reshape(-1,1),x_test_teacher_number,x_test_price,x_test_quantity,x_test_title_bow,x_test_essay_bow)).tocsr()
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import math
# Randomized hyper-parameter search over (n_estimators, max_depth) using
# ROC-AUC; n_iter=9 samples 9 of the 8*9=72 combinations.
RF_ = RandomForestClassifier(class_weight = "balanced")
parameters = {'n_estimators':[10, 50, 100, 150, 200, 300, 500, 1000],'max_depth':[2,3,4,5,6,7,8,9,10]}
clf = RandomizedSearchCV(RF_, parameters,n_iter = 9, scoring='roc_auc', return_train_score = True)
clf.fit(X_train_set_1, y_train)
results = pd.DataFrame.from_dict(clf.cv_results_)
# Plot train vs CV AUC against log10(n_estimators).
results = results.sort_values(['param_n_estimators'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_n_estimators'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(n_estimators): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
# Plot train vs CV AUC against log10(max_depth) from the same search results.
results = results.sort_values(['param_max_depth'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_max_depth'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(param_max_depth): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
# bare expression: displays the results table in a notebook, no-op as a script
results
# the best value for n_estimators and max_depth from the above table
# (hand-picked from the displayed results rather than clf.best_params_)
best_n_estimator = 200
best_max_depth = 9
from sklearn.metrics import roc_curve, auc
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve
from sklearn.metrics import roc_curve, auc
# Refit the random forest with the chosen hyper-parameters and plot ROC
# curves for train and test.
RF_ = RandomForestClassifier(n_estimators = best_n_estimator, max_depth = best_max_depth, class_weight = "balanced")
RF_.fit(X_train_set_1, y_train)
# roc_auc_score(y_true, y_score): the 2nd parameter must be probability
# estimates of the positive class, not the predicted labels — take column 1.
y_train_pred_prob = RF_.predict_proba(X_train_set_1)[:, 1]
y_test_pred_prob = RF_.predict_proba(X_test_set_1)[:, 1]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred_prob)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred_prob)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# FIX: the axis labels were swapped — roc_curve's fpr is plotted on x, tpr on y.
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.title("ROC PLOTS for train,test")
plt.grid()
plt.show()
# Heatmap of mean train vs CV scores from the hyper-parameter search above.
scores = [results['mean_train_score'].values,results['mean_test_score'].values]
sns.heatmap(np.asarray(scores),annot = True, cbar=False)
# We pick the ROC threshold that maximizes tpr*(1-fpr), i.e. the point with
# simultaneously high TPR and low FPR (not simply the "least fpr").
def find_best_threshold(threshould, fpr, tpr):
    """Return the ROC threshold that maximizes tpr*(1-fpr).

    threshould : array of thresholds returned by sklearn's roc_curve
    fpr, tpr   : false/true positive rate arrays aligned with `threshould`
    """
    # Compute the score array once instead of twice (argmax + max).
    score = tpr * (1 - fpr)
    best_index = np.argmax(score)
    t = threshould[best_index]
    # (tpr*(1-fpr)) is maximum when fpr is very low and tpr is very high
    print("the maximum value of tpr*(1-fpr)", score[best_index], "for threshold", np.round(t,3))
    return t
def predict_with_best_t(proba, threshould):
    """Convert positive-class probabilities into 0/1 labels.

    proba      : iterable of positive-class probability estimates
    threshould : cut-off; probabilities >= threshould are labelled 1
    """
    # List comprehension replaces the manual append loop (same behavior).
    return [1 if p >= threshould else 0 for p in proba]
print("="*100)
from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
print("Train confusion matrix")
sns.heatmap(confusion_matrix(y_train, predict_with_best_t(y_train_pred_prob, best_t)),annot = True, fmt = "d", cbar=False)
print("Test confusion matrix")
sns.heatmap(confusion_matrix(y_test, predict_with_best_t(y_test_pred_prob, best_t)), annot = True, fmt = "d", cbar=False)
depth = [2,3,4,5,6,7,8,9,10]
estimators = [10,50,100,150,200,300,500]
plot_3d_plot(X_train_set_1, y_train, X_test_set_1, y_test, depth, estimators,"RF_")
# merge two sparse matrices: https://stackoverflow.com/a/19710648/4084039
# with the same hstack function we are concatinating a sparse matrix and a dense matirx :)
from scipy.sparse import hstack
# Feature set 2: encoded categoricals + numeric features + TFIDF text vectors.
X_train_set_2 = hstack((X_train['train_pro_cat_1'].values.reshape(-1,1), X_train['train_pro_cat_0'].values.reshape(-1,1), X_train['train_pro_subcat_1'].values.reshape(-1,1), X_train['train_pro_subcat_0'].values.reshape(-1,1), X_train['train_school_state_1'].values.reshape(-1,1), X_train['train_school_state_0'].values.reshape(-1,1), X_train['train_grade_1'].values.reshape(-1,1),\
X_train['train_grade_0'].values.reshape(-1,1),X_train['train_prefix_1'].values.reshape(-1,1),X_train['train_prefix_0'].values.reshape(-1,1),x_train_teacher_number,x_train_price,x_train_quantity,x_train_title_tfidf,x_train_essay_tfidf)).tocsr()
X_test_set_2 = hstack((X_test['test_pro_cat_1'].values.reshape(-1,1),X_test['test_pro_cat_0'].values.reshape(-1,1),X_test['test_pro_subcat_1'].values.reshape(-1,1),X_test['test_pro_subcat_0'].values.reshape(-1,1),X_test['test_school_state_1'].values.reshape(-1,1),X_test['test_school_state_0'].values.reshape(-1,1),X_test['test_grade_1'].values.reshape(-1,1),\
X_test['test_grade_0'].values.reshape(-1,1),X_test['test_prefix_1'].values.reshape(-1,1),X_test['test_prefix_0'].values.reshape(-1,1),x_test_teacher_number,x_test_price,x_test_quantity,x_test_title_tfidf,x_test_essay_tfidf)).tocsr()
# Randomized search over (n_estimators, max_depth) for RF on the TFIDF set.
RF_ = RandomForestClassifier(class_weight = "balanced")
parameters = {'n_estimators':[10, 50, 100, 150, 200, 300, 500, 1000],'max_depth':[2,3,4,5,6,7,8,9,10]}
clf = RandomizedSearchCV(RF_, parameters,n_iter = 9, scoring='roc_auc', return_train_score = True)
clf.fit(X_train_set_2, y_train)
results = pd.DataFrame.from_dict(clf.cv_results_)
results
# Train vs CV AUC against log10(n_estimators).
results = results.sort_values(['param_n_estimators'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_n_estimators'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(n_estimators): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
# Same curves against log10(max_depth).
results = results.sort_values(['param_max_depth'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_max_depth'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(max_depth): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
# the best value for n_estimators and max_depth from the above table are
best_n_estimator = 300
best_max_depth = 8
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve
# Random Forest on feature set 2 (TFIDF): refit with the tuned
# hyperparameters and plot the train/test ROC curves.
RF_ = RandomForestClassifier(n_estimators = best_n_estimator, max_depth = best_max_depth, class_weight = "balanced")
RF_.fit(X_train_set_2, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = RF_.predict_proba(X_train_set_2)
y_test_pred = RF_.predict_proba(X_test_set_2)
# Keep only the positive-class probability (column 1 of predict_proba).
y_train_pred_prob = [row[1] for row in y_train_pred]
y_test_pred_prob = [row[1] for row in y_test_pred]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred_prob)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred_prob)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# BUG FIX: the axis labels were swapped — the x-axis is FPR, the y-axis TPR.
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.title("ROC PLOTS for train,test")
plt.grid()
plt.show()
print("="*100)
from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
print("Train confusion matrix")
sns.heatmap(confusion_matrix(y_train, predict_with_best_t(y_train_pred_prob, best_t)),annot = True, fmt = "d", cbar=False)
print("Test confusion matrix")
sns.heatmap(confusion_matrix(y_test, predict_with_best_t(y_test_pred_prob, best_t)), annot = True, fmt = "d", cbar=False)
scores = [results['mean_train_score'].values, results['mean_test_score'].values]
sns.heatmap(np.asarray(scores), annot=True, cbar= False)
depth = [2,3,4,5,6,7,8,9,10]
estimators = [10,50,100,150,200,300,500]
plot_3d_plot(X_train_set_2, y_train, X_test_set_2, y_test, depth, estimators,"RF_")
# merge two sparse matrices: https://stackoverflow.com/a/19710648/4084039
# with the same hstack function we are concatinating a sparse matrix and a dense matirx :)
from scipy.sparse import csr_matrix
X_train_set_3 = hstack((csr_matrix(X_train['train_pro_cat_1'].values).reshape(-1,1), csr_matrix(X_train['train_pro_cat_0'].values).reshape(-1,1),csr_matrix(X_train['train_pro_subcat_1'].values).reshape(-1,1), csr_matrix(X_train['train_pro_subcat_0'].values).reshape(-1,1),csr_matrix(X_train['train_school_state_1'].values).reshape(-1,1),csr_matrix(X_train['train_school_state_0'].values).reshape(-1,1),\
csr_matrix( X_train['train_grade_1'].values).reshape(-1,1),csr_matrix( X_train['train_grade_0'].values).reshape(-1,1),csr_matrix(X_train['train_prefix_1'].values).reshape(-1,1),csr_matrix(X_train['train_prefix_0'].values).reshape(-1,1),x_train_teacher_number,x_train_price,x_train_quantity,x_train_avg_w2v_title, x_train_avg_w2v_essay)).tocsr()
X_test_set_3 = hstack((csr_matrix(X_test['test_pro_cat_1'].values).reshape(-1,1),csr_matrix(X_test['test_pro_cat_0'].values).reshape(-1,1),csr_matrix(X_test['test_pro_subcat_1'].values).reshape(-1,1),csr_matrix(X_test['test_pro_subcat_0'].values).reshape(-1,1),csr_matrix(X_test['test_school_state_1'].values).reshape(-1,1),csr_matrix(X_test['test_school_state_0'].values).reshape(-1,1),csr_matrix(X_test['test_grade_1'].values).reshape(-1,1),\
csr_matrix(X_test['test_grade_0'].values).reshape(-1,1),csr_matrix(X_test['test_prefix_1'].values).reshape(-1,1),csr_matrix(X_test['test_prefix_0'].values).reshape(-1,1),x_test_teacher_number,x_test_price,x_test_quantity, x_test_avg_w2v_title,x_test_avg_w2v_essay)).tocsr()
RF_ = RandomForestClassifier(class_weight = "balanced")
parameters = {'n_estimators':[10, 50, 100, 150, 200, 300, 500, 1000],'max_depth':[2,3,4,5,6,7,8,9,10]}
clf = RandomizedSearchCV(RF_, parameters,n_iter = 9, scoring='roc_auc', return_train_score = True)
clf.fit(X_train_set_3, y_train)
results = pd.DataFrame.from_dict(clf.cv_results_)
results = results.sort_values(['param_n_estimators'])
results
results = results.sort_values(['param_max_depth'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_max_depth'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(max_depth): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
results = results.sort_values(['param_n_estimators'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_n_estimators'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(n_estimators): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
# the best value for n_estimators and max_depth from the above graphs are
best_n_estimator = 500
best_max_depth = 4
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve
# Random Forest on feature set 3 (AVG W2V): refit with the tuned
# hyperparameters and plot the train/test ROC curves.
RF_ = RandomForestClassifier(n_estimators = best_n_estimator, max_depth = best_max_depth, class_weight = "balanced")
RF_.fit(X_train_set_3, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = RF_.predict_proba(X_train_set_3)
y_test_pred = RF_.predict_proba(X_test_set_3)
# Keep only the positive-class probability (column 1 of predict_proba).
y_train_pred_prob = [row[1] for row in y_train_pred]
y_test_pred_prob = [row[1] for row in y_test_pred]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred_prob)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred_prob)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# BUG FIX: the axis labels were swapped — the x-axis is FPR, the y-axis TPR.
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.title("ROC PLOTS for train,test")
plt.grid()
plt.show()
print("="*100)
from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
print("Train confusion matrix")
sns.heatmap(confusion_matrix(y_train, predict_with_best_t(y_train_pred_prob, best_t)),annot = True, fmt = "d", cbar=False)
print("Test confusion matrix")
sns.heatmap(confusion_matrix(y_test, predict_with_best_t(y_test_pred_prob, best_t)), annot = True, fmt = "d", cbar=False)
scores = [results['mean_train_score'].values, results['mean_test_score'].values]
sns.heatmap(np.asarray(scores), annot = True, cbar=False)
depth = [2,3,4,5,6,7,8,9,10]
estimators = [10,50,100,150,200,300,500]
plot_3d_plot(X_train_set_3, y_train, X_test_set_3, y_test, depth, estimators, "RF_")
# merge two sparse matrices: https://stackoverflow.com/a/19710648/4084039
# with the same hstack function we are concatinating a sparse matrix and a dense matirx :)
X_train_set_4 = hstack((csr_matrix(X_train['train_pro_cat_1'].values).reshape(-1,1), csr_matrix(X_train['train_pro_cat_0'].values).reshape(-1,1),csr_matrix(X_train['train_pro_subcat_1'].values).reshape(-1,1), csr_matrix(X_train['train_pro_subcat_0'].values).reshape(-1,1),csr_matrix(X_train['train_school_state_1'].values).reshape(-1,1),csr_matrix(X_train['train_school_state_0'].values).reshape(-1,1),\
csr_matrix( X_train['train_grade_1'].values).reshape(-1,1),csr_matrix( X_train['train_grade_0'].values).reshape(-1,1),csr_matrix(X_train['train_prefix_1'].values).reshape(-1,1),csr_matrix(X_train['train_prefix_0'].values).reshape(-1,1),x_train_teacher_number,x_train_price,x_train_quantity,x_train_weighted_w2v_title, x_train_weighted_w2v_essay)).tocsr()
X_test_set_4 = hstack((csr_matrix(X_test['test_pro_cat_1'].values).reshape(-1,1),csr_matrix(X_test['test_pro_cat_0'].values).reshape(-1,1),csr_matrix(X_test['test_pro_subcat_1'].values).reshape(-1,1),csr_matrix(X_test['test_pro_subcat_0'].values).reshape(-1,1),csr_matrix(X_test['test_school_state_1'].values).reshape(-1,1),csr_matrix(X_test['test_school_state_0'].values).reshape(-1,1),csr_matrix(X_test['test_grade_1'].values).reshape(-1,1),\
csr_matrix(X_test['test_grade_0'].values).reshape(-1,1),csr_matrix(X_test['test_prefix_1'].values).reshape(-1,1),csr_matrix(X_test['test_prefix_0'].values).reshape(-1,1),x_test_teacher_number,x_test_price,x_test_quantity, x_test_weighted_w2v_title,x_test_weighted_w2v_essay)).tocsr()
RF_ = RandomForestClassifier(class_weight = "balanced")
parameters = {'n_estimators':[10, 50, 100, 150, 200, 300, 500, 1000],'max_depth':[2,3,4,5,6,7,8,9,10]}
clf = RandomizedSearchCV(RF_, parameters,n_iter = 9, scoring='roc_auc', return_train_score = True)
clf.fit(X_train_set_4, y_train)
results = pd.DataFrame.from_dict(clf.cv_results_)
results = results.sort_values(['param_n_estimators'])
results
# the best value for n_estimators and max_depth from the above table are
best_n_estimator = 10
best_max_depth = 9
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve
# Random Forest on feature set 4 (TFIDF-weighted W2V): refit with the tuned
# hyperparameters and plot the train/test ROC curves.
RF_ = RandomForestClassifier(n_estimators = best_n_estimator, max_depth = best_max_depth, class_weight = "balanced")
RF_.fit(X_train_set_4, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = RF_.predict_proba(X_train_set_4)
y_test_pred = RF_.predict_proba(X_test_set_4)
# Keep only the positive-class probability (column 1 of predict_proba).
y_train_pred_prob = [row[1] for row in y_train_pred]
y_test_pred_prob = [row[1] for row in y_test_pred]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred_prob)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred_prob)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# BUG FIX: the axis labels were swapped — the x-axis is FPR, the y-axis TPR.
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.title("ROC PLOTS for train,test")
plt.grid()
plt.show()
print("="*100)
from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
print("Train confusion matrix")
sns.heatmap(confusion_matrix(y_train, predict_with_best_t(y_train_pred_prob, best_t)),annot = True, fmt = "d", cbar=False)
print("Test confusion matrix")
sns.heatmap(confusion_matrix(y_test, predict_with_best_t(y_test_pred_prob, best_t)), annot = True, fmt = "d", cbar=False)
scores = [results['mean_train_score'].values, results['mean_test_score'].values]
sns.heatmap(np.asarray(scores),annot = True,cbar=False)
depth = [2,3,4,5,6,7,8,9,10]
estimators = [10,50,100,150,200,300,500]
plot_3d_plot(X_train_set_4, y_train, X_test_set_4, y_test, depth, estimators, "RF_")
from sklearn.ensemble import GradientBoostingClassifier
GB_ = GradientBoostingClassifier()
parameters = {'n_estimators':[10, 50, 100, 150, 200, 300, 500, 1000], 'max_depth':[2,3,4,5,6,7,8,9,10]}
clf = RandomizedSearchCV(GB_, parameters,n_iter = 8, scoring='roc_auc', return_train_score=True)
clf.fit(X_train_set_1, y_train)
results = pd.DataFrame.from_dict(clf.cv_results_)
results = results.sort_values(['param_n_estimators'])
results
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_n_estimators'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(n_estimators): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
results = results.sort_values(['param_max_depth'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_max_depth'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(max_depth): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
# the best value for n_estimators and max_depth from the above graphs are
best_n_estimator = 50
best_max_depth = 4
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve
# Gradient Boosting on feature set 1 (BOW): refit with the tuned
# hyperparameters and plot the train/test ROC curves.
GB_ = GradientBoostingClassifier(n_estimators = best_n_estimator, max_depth = best_max_depth)
GB_.fit(X_train_set_1, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = GB_.predict_proba(X_train_set_1)
y_test_pred = GB_.predict_proba(X_test_set_1)
# Keep only the positive-class probability (column 1 of predict_proba).
y_train_pred_prob = [row[1] for row in y_train_pred]
y_test_pred_prob = [row[1] for row in y_test_pred]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred_prob)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred_prob)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# BUG FIX: the axis labels were swapped — the x-axis is FPR, the y-axis TPR.
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.title("ROC PLOTS for train,test")
plt.grid()
plt.show()
print("="*100)
from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
print("Train confusion matrix")
sns.heatmap(confusion_matrix(y_train, predict_with_best_t(y_train_pred_prob, best_t)),annot = True, fmt = "d", cbar=False)
print("Test confusion matrix")
sns.heatmap(confusion_matrix(y_test, predict_with_best_t(y_test_pred_prob, best_t)), annot = True, fmt = "d", cbar=False)
scores = [results['mean_train_score'].values, results['mean_test_score'].values]
sns.heatmap(np.asarray(scores),annot = True, cbar=False)
depth = [2,3,4,5,6,7,8,9,10]
estimators = [10,50,100,150,200,300,500]
plot_3d_plot(X_train_set_1, y_train, X_test_set_1, y_test, depth, estimators, algo = "GB_")
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
import math
GB_ = GradientBoostingClassifier()
parameters = {'n_estimators':[10, 50, 100, 150, 200, 300, 500, 1000],'max_depth':[2,3,4,5,6,7,8,9,10]}
clf = RandomizedSearchCV(GB_, parameters,n_iter = 9, scoring='roc_auc', return_train_score = True)
clf.fit(X_train_set_2, y_train)
results = pd.DataFrame.from_dict(clf.cv_results_)
results = results.sort_values(['param_max_depth'])
results
# the best value for n_estimators and max_depth from the above graphs are
best_n_estimator = 50
best_max_depth = 9
results = results.sort_values(['param_n_estimators'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_n_estimators'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(n_estimators): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
results = results.sort_values(['param_max_depth'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_max_depth'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(max_depth): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve
# Gradient Boosting on feature set 2 (TFIDF): refit with the tuned
# hyperparameters and plot the train/test ROC curves.
GB_ = GradientBoostingClassifier(n_estimators = best_n_estimator, max_depth = best_max_depth)
GB_.fit(X_train_set_2, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = GB_.predict_proba(X_train_set_2)
y_test_pred = GB_.predict_proba(X_test_set_2)
# Keep only the positive-class probability (column 1 of predict_proba).
y_train_pred_prob = [row[1] for row in y_train_pred]
y_test_pred_prob = [row[1] for row in y_test_pred]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred_prob)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred_prob)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# BUG FIX: the axis labels were swapped — the x-axis is FPR, the y-axis TPR.
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.title("ROC PLOTS for train,test")
plt.grid()
plt.show()
print("="*100)
from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
print("Train confusion matrix")
sns.heatmap(confusion_matrix(y_train, predict_with_best_t(y_train_pred_prob, best_t)),annot = True, fmt = "d", cbar=False)
print("Test confusion matrix")
sns.heatmap(confusion_matrix(y_test, predict_with_best_t(y_test_pred_prob, best_t)), annot = True, fmt = "d", cbar=False)
scores = [results['mean_train_score'].values, results['mean_test_score'].values]
sns.heatmap(np.asarray(scores),annot = True, cbar=False)
depth = [2,3,4,5,6,7,8,9,10]
estimators = [10,50,100,150,200,300,500]
plot_3d_plot(X_train_set_2, y_train, X_test_set_2, y_test, depth, estimators, algo = 'GB_')
GB_ = GradientBoostingClassifier()
parameters = {'n_estimators':[10, 50, 100, 150, 200, 300, 500, 1000], 'max_depth':[2,3,4,5,6,7,8,9,10]}
clf = RandomizedSearchCV(GB_, parameters,n_iter = 8, scoring='roc_auc', return_train_score = True)
clf.fit(X_train_set_3, y_train)
results = pd.DataFrame.from_dict(clf.cv_results_)
results = results.sort_values(['param_n_estimators'])
results
# the best value for n_estimators and max_depth from the above graphs are
best_n_estimator = 150
best_max_depth = 3
results = results.sort_values(['param_n_estimators'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_n_estimators'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(n_estimators): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
results = results.sort_values(['param_max_depth'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_max_depth'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(max_depth): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve
# Gradient Boosting on feature set 3 (AVG W2V): refit with the tuned
# hyperparameters and plot the train/test ROC curves.
GB_ = GradientBoostingClassifier(n_estimators = best_n_estimator, max_depth = best_max_depth)
GB_.fit(X_train_set_3, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = GB_.predict_proba(X_train_set_3)
y_test_pred = GB_.predict_proba(X_test_set_3)
# Keep only the positive-class probability (column 1 of predict_proba).
y_train_pred_prob = [row[1] for row in y_train_pred]
y_test_pred_prob = [row[1] for row in y_test_pred]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred_prob)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred_prob)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# BUG FIX: the axis labels were swapped — the x-axis is FPR, the y-axis TPR.
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.title("ROC PLOTS for train,test")
plt.grid()
plt.show()
print("="*100)
from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
print("Train confusion matrix")
sns.heatmap(confusion_matrix(y_train, predict_with_best_t(y_train_pred_prob, best_t)),annot = True, fmt = "d", cbar=False)
print("Test confusion matrix")
sns.heatmap(confusion_matrix(y_test, predict_with_best_t(y_test_pred_prob, best_t)), annot = True, fmt = "d", cbar=False)
scores = [results['mean_train_score'].values, results['mean_test_score'].values]
sns.heatmap(np.asarray(scores), annot = True, cbar=False)
depth = [2,3,4,5,6,7,8,9,10]
estimators = [10,50,100,150,200,300,500]
plot_3d_plot(X_train_set_3, y_train, X_test_set_3, y_test, depth, estimators, algo = "GB_")
GB_ = GradientBoostingClassifier()
parameters = {'n_estimators':[10, 50, 100, 150, 200, 300, 500, 1000], 'max_depth':[2,3,4,5,6,7,8,9,10]}
clf = RandomizedSearchCV(GB_, parameters,n_iter = 8, scoring='roc_auc', return_train_score = True)
clf.fit(X_train_set_4, y_train)
results = pd.DataFrame.from_dict(clf.cv_results_)
results = results.sort_values(['param_n_estimators'])
results
# the best value for n_estimators and max_depth from the above graphs are
best_n_estimator = 10
best_max_depth = 10
results = results.sort_values(['param_n_estimators'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_n_estimators'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(n_estimators): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
results = results.sort_values(['param_max_depth'])
train_auc= results['mean_train_score']
train_auc_std= results['std_train_score']
cv_auc = results['mean_test_score']
cv_auc_std= results['std_test_score']
C_ = results['param_max_depth'].apply(lambda x: math.log10(x))
plt.plot(C_, train_auc, label='Train AUC')
plt.plot(C_, cv_auc, label='CV AUC')
plt.scatter(C_, train_auc, label='Train AUC points')
plt.scatter(C_, cv_auc, label='CV AUC points')
plt.legend()
plt.xlabel("log10(max_depth): hyperparameter")
plt.ylabel("AUC")
plt.title("Hyper parameter Vs AUC plot")
plt.grid()
plt.show()
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve
# Gradient Boosting on feature set 4 (TFIDF-weighted W2V): refit with the
# tuned hyperparameters and plot the train/test ROC curves.
GB_ = GradientBoostingClassifier(n_estimators = best_n_estimator, max_depth = best_max_depth)
GB_.fit(X_train_set_4, y_train)
# roc_auc_score(y_true, y_score) the 2nd parameter should be probability estimates of the positive class
# not the predicted outputs
y_train_pred = GB_.predict_proba(X_train_set_4)
y_test_pred = GB_.predict_proba(X_test_set_4)
# Keep only the positive-class probability (column 1 of predict_proba).
y_train_pred_prob = [row[1] for row in y_train_pred]
y_test_pred_prob = [row[1] for row in y_test_pred]
train_fpr, train_tpr, tr_thresholds = roc_curve(y_train, y_train_pred_prob)
test_fpr, test_tpr, te_thresholds = roc_curve(y_test, y_test_pred_prob)
plt.plot(train_fpr, train_tpr, label="train AUC ="+str(auc(train_fpr, train_tpr)))
plt.plot(test_fpr, test_tpr, label="test AUC ="+str(auc(test_fpr, test_tpr)))
plt.legend()
# BUG FIX: the axis labels were swapped — the x-axis is FPR, the y-axis TPR.
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.title("ROC PLOTS for train,test")
plt.grid()
plt.show()
print("="*100)
from sklearn.metrics import confusion_matrix
best_t = find_best_threshold(tr_thresholds, train_fpr, train_tpr)
print("Train confusion matrix")
sns.heatmap(confusion_matrix(y_train, predict_with_best_t(y_train_pred_prob, best_t)),annot = True, fmt = "d", cbar=False)
print("Test confusion matrix")
sns.heatmap(confusion_matrix(y_test, predict_with_best_t(y_test_pred_prob, best_t)), annot = True, fmt = "d", cbar=False)
scores = [results['mean_train_score'].values, results['mean_test_score'].values]
sns.heatmap(np.asarray(scores), annot = True, cbar=False)
depth = [2,3,4,5,6,7,8,9,10]
estimators = [10,50,100,150,200,300,500]
plot_3d_plot(X_train_set_4, y_train, X_test_set_4, y_test, depth, estimators, algo = 'GB_')
from prettytable import PrettyTable
# Final summary of every model/featurization combination explored above,
# with the tuned hyperparameters and the resulting train/test AUC values.
x = PrettyTable()
x.field_names = ["Model", "Dataset","best_n_estimator", "best_max_depth", "Train AUC", "Test AUC"]
# FIX: corrected the misspelling "Classfier" -> "Classifier" and the
# inconsistent "W2v" casing in the displayed model/dataset names.
x.add_row(["RandomForestClassifier", "BOW", 200, 9, 0.8760, 0.7002])
x.add_row(["RandomForestClassifier", "TFIDF", 300, 8, 0.8891, 0.6963])
x.add_row(["RandomForestClassifier", "AVG W2V", 500, 4, 0.7576, 0.6950])
x.add_row(["RandomForestClassifier", "TFIDF W2V", 10, 9, 0.9195, 0.6524])
x.add_row(["GradientBoostingClassifier", "BOW", 50, 4, 0.7943, 0.6916])
x.add_row(["GradientBoostingClassifier", "TFIDF", 50, 9, 0.9718, 0.6871])
x.add_row(["GradientBoostingClassifier", "AVG W2V", 150, 3, 0.8482, 0.7031])
x.add_row(["GradientBoostingClassifier", "TFIDF W2V", 10, 10, 0.9765, 0.6468])
print(x)